#1) script joining character and metadata
#2) script for predicting character gender

# Packages attached by this script.
library(gender)   # gender(): name-based gender prediction
library(dplyr)    # joins, filter/mutate/select, bind_rows, add_count
library(readr)    # read_csv(), write_csv()
# The original line read `library(strin)`, which is not an installable package
# name and stops the script with "there is no package called 'strin'".
# NOTE(review): stringr is assumed to be the intended package - confirm.
library(stringr)
library(tidyr)    # separate()
library(stringi)  # stri_trans_general(): accent removal

# Load the two input tables: one row per character mention, and one row of
# metadata per file (the two are joined on `filename` further down).
character_csv <- read_csv(file = "/path/to/character_data.csv")
metadata_csv <- read_csv(file = "/path/to/metadata.csv")

# Attach the per-file metadata to every character row (matched on `filename`),
# transliterate the names to plain ASCII so accented names can be looked up by
# the gender package, and keep only the columns used downstream.
character_metadata <-
  left_join(character_csv, metadata_csv, by = "filename") %>%
  mutate(name = stri_trans_general(name, "Latin-ASCII")) %>%
  select(
    filename, name, relation, lemma, publ_date,
    predicted_author_gender, characterization
  )


# Characters from works with publ_date < 1930. The gender lookup is
# year-sensitive, so these names are predicted with the "ipums" method.
character_before_1930 <- filter(character_metadata, publ_date < 1930)

# One prediction row per distinct name.
gender_bf1930 <- distinct(
  gender(character_before_1930$name, method = "ipums")
)

# Attach the prediction to every character row; names the lookup could not
# resolve end up with NA in `character_gender`.
character_gender_bf1930 <- character_before_1930 %>%
  left_join(gender_bf1930, by = "name") %>%
  select(
    filename, name, relation, lemma, publ_date,
    predicted_author_gender, character_gender = gender, characterization
  )

# Characters from works published in 1930 or later, predicted with the "ssa"
# method.
#
# The filter is `>= 1930` rather than `> 1930`: the pre-1930 subset is selected
# with `publ_date < 1930`, so a strict `>` here would silently drop every row
# whose publication year is exactly 1930.
#
# The rows are deliberately NOT de-duplicated at this point. The pre-1930
# subset is not de-duplicated either, and the later per-file lemma count relies
# on repeated rows to count occurrences; collapsing them only in this branch
# would make the two periods' counts incomparable.
character_after_1930 <- character_metadata %>%
  filter(publ_date >= 1930)

# One prediction row per distinct name.
gender_af1930 <- gender(character_after_1930$name, method = "ssa") %>%
  distinct()

# Attach the prediction to every character row; names the lookup could not
# resolve end up with NA in `character_gender`.
character_gender_af1930 <- character_after_1930 %>%
  left_join(gender_af1930, by = "name") %>%
  rename(character_gender = gender) %>%
  select(
    filename, name, relation, lemma, publ_date,
    predicted_author_gender, character_gender, characterization
  )

# Dictionaries of gendered nouns and titles. They are compared with `%in%`
# against the first and second word of a character name, so every entry is a
# single lower-case token.
mNouns <- c(
  "actor", "bachelor", "boy", "groom", "brother", "dad", "duke",
  "man", "emperor", "father", "gentleman", "god", "grandfather", "hero",
  "husband", "king", "nephew", "prince",
  "son", "stepson", "uncle", "lord", "mr", "mister"
)
# "lady" was listed twice in the original vector; the duplicate is removed
# (membership tests are unaffected).
fNouns <- c(
  "actress", "spinster", "girl", "bride", "sister", "countess", "duchess",
  "woman", "empress", "mother", "lady", "goddess", "grandmother", "heroine",
  "wife", "queen", "niece", "princess",
  "daughter", "stepdaughter", "aunt", "ms", "mrs", "miss"
)

# Combine both periods and finalise `character_gender`:
#   1. recode the package labels "female"/"male" to "F"/"M" ("female" first,
#      because "male" is a substring of it);
#   2. let a gendered noun or title in the first or second word of the name
#      override the name-based prediction (a feminine match wins over a
#      masculine one because it is applied last);
#   3. mark everything still unresolved as "U".
#
# The first two words are extracted into temporary columns with
# `remove = FALSE`, so `name` itself is never rebuilt. The previous
# separate -> unite -> gsub("NA", "") round-trip altered the names: words after
# the second were dropped, one-word names gained a trailing space, and a
# literal "NA" inside a real name was deleted.
# `extra = "drop"` / `fill = "right"` keep the same first/second-word
# extraction as before, without the warnings.
# NOTE(review): the noun match is case-sensitive and the dictionaries are
# lower-case - confirm that names arrive lower-cased.
char_gen_combine <-
  bind_rows(character_gender_bf1930, character_gender_af1930) %>%
  mutate(character_gender = gsub("female", "F", character_gender)) %>%
  mutate(character_gender = gsub("male", "M", character_gender)) %>%
  separate(
    name, c("temp1", "temp2"),
    sep = " ", remove = FALSE, extra = "drop", fill = "right"
  ) %>%
  mutate(
    character_gender = ifelse(
      temp1 %in% mNouns | temp2 %in% mNouns, "M", character_gender
    ),
    character_gender = ifelse(
      temp1 %in% fNouns | temp2 %in% fNouns, "F", character_gender
    )
  ) %>%
  select(-temp1, -temp2) %>%
  arrange(name, characterization) %>%
  mutate(
    character_gender = ifelse(is.na(character_gender), "U", character_gender)
  )

# Count how many rows share the same (filename, name, lemma) combination, store
# that as `count` on every row, then collapse fully identical rows and sort.
char_gen_final <- char_gen_combine %>%
  group_by(filename, name, lemma) %>%
  add_count(name = "count") %>%
  # Drop the grouping once the count exists, so the result is a plain tibble
  # and later verbs are not silently applied per group.
  ungroup() %>%
  distinct() %>%
  arrange(name, characterization, desc(count))


# Write the final per-character table to disk as CSV.
# NOTE(review): the destination is a machine-specific absolute path.
output_path <- "/Users/cheng/OneDrive/Desktop/character/character/1_character_gender/1_output.csv"
write_csv(char_gen_final, output_path)
  
  
  
  
  
  
  





